# Preprocess data ----

library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(ggExtra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(viridis)
## Loading required package: viridisLite
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(lattice)
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(caret)
library(gains)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Load the cleaned experiment data and coerce each column to its proper type.
# NOTE(review): attach() is avoided -- it copies columns onto the search path
# where they can be masked by (or collide with) later top-level variables;
# columns are referenced explicitly via data$ instead.
data <- read.csv("E:/Users/ASUS/Documents/term8/ravesh tahghigh/clean data.csv",
                 header = TRUE)
subj <- as.factor(data$subj)        # participant identifier
trial <- as.numeric(data$trial)     # trial index
s <- as.factor(data$S)              # stimulus
r <- as.factor(data$R)              # response
rt <- as.numeric(data$RT)           # response time
# `correct` arrives as "TRUE"/"FALSE"; recode to a 0/1 factor (1 = correct).
correct <- as.factor(ifelse(data$correct == "TRUE", 1, 0))
instruction <- as.factor(data$instruction)  # instruction condition
new.data <- data.frame(subj, trial, s, r, rt, correct, instruction)

#EDA PART

# Histogram of response times for trials run under the "accuracy" instruction.
accuracy.trials <- subset(new.data, instruction == "accuracy")
hist(accuracy.trials$rt,
     col = "pink",
     main = "histogram of response time when instruction = accuracy",
     xlab = "response time",
     ylab = "frequency")

# Counts of correct (1) vs. incorrect (0) responses per participant,
# drawn as horizontal side-by-side bars.
counts.by.subject <- table(correct, subj)
barplot(counts.by.subject,
        beside = TRUE,
        horiz = TRUE,
        las = 1,
        legend.text = TRUE,
        col = c("darkorchid4", "darkorchid1"),
        main = "number of right and wrong choices for each person",
        xlab = "",
        ylab = "")

# Counts of correct (1) vs. incorrect (0) responses per instruction condition.
correct.instruction <- table(correct, instruction)
# NOTE(review): the barplot return value (bar midpoints) was stored in an
# unused variable; the dead assignment has been dropped.
barplot(correct.instruction,
        legend.text = TRUE,
        beside = TRUE,
        main = "number of right and wrong choices for each instruction",
        xlab = "type of instruction",
        ylab = "",
        col = c("aliceblue", "cadetblue1"),
        horiz = FALSE, las = 1)

# Overlaid response-time histograms: correct trials (red) vs. wrong (blue).
rt.correct <- subset(new.data, correct == "1")$rt
rt.wrong <- subset(new.data, correct == "0")$rt
hist(rt.correct, breaks = 30, col = rgb(1, 0, 0, 0.5),
     xlab = "response time", ylab = "frequency",
     main = "distribution of response time for each choice")
hist(rt.wrong, breaks = 30, col = rgb(0, 0, 1, 0.5), add = TRUE)
legend("topright",
       legend = c("TRUE", "FALSE"),
       col = c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)),
       pt.cex = 2, pch = 15)

# Response-time distribution per instruction condition.
# NOTE(review): removed `instruction.rt = table(rt, instruction)` -- it
# cross-tabulated a continuous variable into a huge table that was never used.
boxplot(rt ~ instruction, col = terrain.colors(4),
        main = "distribution of response time for each instruction type",
        xlab = "instruction",
        ylab = "response time")

# Density of response time, one filled curve per instruction condition.
rt.density.plot <- ggplot(new.data,
                          aes(x = rt, group = instruction, fill = instruction)) +
  geom_density(adjust = 1.5, alpha = 0.4) +
  theme_ipsum() +
  labs(title = "distribution of response time for each type of instruction",
       x = "response time",
       y = "")
rt.density.plot

# Mean response time for each instruction x correctness cell, rendered as an
# interactive heat map whose hover text shows the cell mean.
mean.rt <- aggregate(rt ~ instruction + correct, data = new.data, FUN = mean)
mean.rt$text <- paste0("response time mean: ", mean.rt$rt)
heat.map <- ggplot(mean.rt, aes(instruction, correct, fill = rt, text = text)) +
  geom_tile() +
  theme_ipsum()
ggplotly(heat.map, tooltip = "text")
# Chi-squared tests of association ----
# Pairwise tests among participant, correctness, and instruction.
# Captured console output follows each call.
# Participant vs. correctness: strongly associated (p < 2.2e-16).
chisq.test(subj,correct)
## 
##  Pearson's Chi-squared test
## 
## data:  subj and correct
## X-squared = 971.2, df = 18, p-value < 2.2e-16
# Participant vs. instruction: weakly associated (p = 0.018).
chisq.test(subj,instruction)
## 
##  Pearson's Chi-squared test
## 
## data:  subj and instruction
## X-squared = 56.074, df = 36, p-value = 0.01763
# Instruction vs. correctness: strongly associated (p < 2.2e-16).
chisq.test(instruction,correct)
## 
##  Pearson's Chi-squared test
## 
## data:  instruction and correct
## X-squared = 219.72, df = 2, p-value < 2.2e-16

# Define train, validation, and test data ----

# Partition new.data into train (9500 rows), validation (4000), test (rest).
# NOTE(review): indices are kept as integers throughout; the original mixed
# integer positions with character row names via row.names()/setdiff(), which
# only works while the data frame keeps its default "1".."n" row names. The
# RNG draws are unchanged, so the same rows are selected.
set.seed(42)  # reproducible split
n.obs <- nrow(new.data)
train_rows <- sample(n.obs, 9500)
train_data <- new.data[train_rows, ]
table(train_data$correct)
## 
##    0    1 
## 1510 7990
remaining_rows <- setdiff(seq_len(n.obs), train_rows)
validation_rows <- sample(remaining_rows, 4000)
validation_data <- new.data[validation_rows, ]
table(validation_data$correct)
## 
##    0    1 
##  663 3337
test_rows <- setdiff(remaining_rows, validation_rows)
test_data <- new.data[test_rows, ]
table(test_data$correct)
## 
##    0    1 
##  351 1967

# Logistic regression models for the "correct" variable ----

# Logistic regression: P(correct = 1) as a function of participant, response
# time, and instruction, fit on the training split; evaluated on validation.
model <- glm(correct ~ subj + rt + instruction, data = train_data,
             family = "binomial")
predict.model.validation <- predict(model, validation_data, type = "response")
# NOTE(review): build the predicted-class factor with both levels declared;
# as.factor() alone drops an absent class and makes confusionMatrix() fail on
# a level mismatch if every prediction falls on one side of the 0.5 cutoff.
confusionMatrix(
  data = factor(ifelse(predict.model.validation > 0.5, 1, 0),
                levels = levels(validation_data$correct)),
  reference = validation_data$correct)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0    8    9
##          1  655 3328
##                                           
##                Accuracy : 0.834           
##                  95% CI : (0.8221, 0.8454)
##     No Information Rate : 0.8342          
##     P-Value [Acc > NIR] : 0.5273          
##                                           
##                   Kappa : 0.0154          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.01207         
##             Specificity : 0.99730         
##          Pos Pred Value : 0.47059         
##          Neg Pred Value : 0.83555         
##              Prevalence : 0.16575         
##          Detection Rate : 0.00200         
##    Detection Prevalence : 0.00425         
##       Balanced Accuracy : 0.50468         
##                                           
##        'Positive' Class : 0               
## 
# Overfitting check: score the model on its own training data; accuracy close
# to the validation figure indicates no serious overfit.
predict.model.train <- predict(model, train_data, type = "response")
# NOTE(review): declare both factor levels explicitly (see validation block).
confusionMatrix(
  data = factor(ifelse(predict.model.train > 0.5, 1, 0),
                levels = levels(train_data$correct)),
  reference = train_data$correct)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0   22   22
##          1 1488 7968
##                                           
##                Accuracy : 0.8411          
##                  95% CI : (0.8335, 0.8484)
##     No Information Rate : 0.8411          
##     P-Value [Acc > NIR] : 0.5069          
##                                           
##                   Kappa : 0.0195          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.014570        
##             Specificity : 0.997247        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.842640        
##              Prevalence : 0.158947        
##          Detection Rate : 0.002316        
##    Detection Prevalence : 0.004632        
##       Balanced Accuracy : 0.505908        
##                                           
##        'Positive' Class : 0               
## 
# Stepwise selection ----
# Backward elimination from the full model (AIC criterion); the printout shows
# all three predictors are retained.
model.back <- model %>% stats::step(direction = "backward")
## Start:  AIC=7567.55
## correct ~ subj + rt + instruction
## 
##               Df Deviance    AIC
## <none>             7523.6 7567.6
## - rt           1   7546.5 7588.5
## - instruction  2   7617.8 7657.8
## - subj        18   8183.0 8191.0
model.back.pred <- predict(model.back, validation_data, type = "response")
# NOTE(review): declare both factor levels explicitly so confusionMatrix()
# cannot fail on a dropped level.
confusionMatrix(
  data = factor(ifelse(model.back.pred > 0.5, 1, 0),
                levels = levels(validation_data$correct)),
  reference = validation_data$correct)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0    8    9
##          1  655 3328
##                                           
##                Accuracy : 0.834           
##                  95% CI : (0.8221, 0.8454)
##     No Information Rate : 0.8342          
##     P-Value [Acc > NIR] : 0.5273          
##                                           
##                   Kappa : 0.0154          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.01207         
##             Specificity : 0.99730         
##          Pos Pred Value : 0.47059         
##          Neg Pred Value : 0.83555         
##              Prevalence : 0.16575         
##          Detection Rate : 0.00200         
##    Detection Prevalence : 0.00425         
##       Balanced Accuracy : 0.50468         
##                                           
##        'Positive' Class : 0               
## 
# Forward selection.
# NOTE(review): starting forward selection from the FULL model with no `scope`
# cannot add terms, so this returns the model unchanged (the printout shows
# only the Start line). A meaningful forward search would start from an
# intercept-only model with scope = formula(model).
model.for <- model %>% stats::step(direction = "forward")
## Start:  AIC=7567.55
## correct ~ subj + rt + instruction
model.for.pred <- predict(model.for, validation_data, type = "response")
# NOTE(review): declare both factor levels explicitly so confusionMatrix()
# cannot fail on a dropped level.
confusionMatrix(
  data = factor(ifelse(model.for.pred > 0.5, 1, 0),
                levels = levels(validation_data$correct)),
  reference = validation_data$correct)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0    8    9
##          1  655 3328
##                                           
##                Accuracy : 0.834           
##                  95% CI : (0.8221, 0.8454)
##     No Information Rate : 0.8342          
##     P-Value [Acc > NIR] : 0.5273          
##                                           
##                   Kappa : 0.0154          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.01207         
##             Specificity : 0.99730         
##          Pos Pred Value : 0.47059         
##          Neg Pred Value : 0.83555         
##              Prevalence : 0.16575         
##          Detection Rate : 0.00200         
##    Detection Prevalence : 0.00425         
##       Balanced Accuracy : 0.50468         
##                                           
##        'Positive' Class : 0               
## 
# Bidirectional stepwise selection; again all three predictors are retained.
model.both <- model %>% stats::step(direction = "both")
## Start:  AIC=7567.55
## correct ~ subj + rt + instruction
## 
##               Df Deviance    AIC
## <none>             7523.6 7567.6
## - rt           1   7546.5 7588.5
## - instruction  2   7617.8 7657.8
## - subj        18   8183.0 8191.0
model.both.pred <- predict(model.both, validation_data, type = "response")
# NOTE(review): declare both factor levels explicitly so confusionMatrix()
# cannot fail on a dropped level.
confusionMatrix(
  data = factor(ifelse(model.both.pred > 0.5, 1, 0),
                levels = levels(validation_data$correct)),
  reference = validation_data$correct)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0    8    9
##          1  655 3328
##                                           
##                Accuracy : 0.834           
##                  95% CI : (0.8221, 0.8454)
##     No Information Rate : 0.8342          
##     P-Value [Acc > NIR] : 0.5273          
##                                           
##                   Kappa : 0.0154          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.01207         
##             Specificity : 0.99730         
##          Pos Pred Value : 0.47059         
##          Neg Pred Value : 0.83555         
##              Prevalence : 0.16575         
##          Detection Rate : 0.00200         
##    Detection Prevalence : 0.00425         
##       Balanced Accuracy : 0.50468         
##                                           
##        'Positive' Class : 0               
## 
# Lift chart for the validation predictions ("1" treated as the event class).
logit.reg.lift <- lift(relevel(as.factor(correct), ref = "1") ~
                         predict.model.validation, data = validation_data)
xyplot(logit.reg.lift, plot = "gain")

# Decile-wise lift chart.
# NOTE(review): as.numeric() on a factor returns the internal level codes
# (1/2), not the 0/1 labels, which distorts the mean-response ratio; decode
# through as.character() so the response is a true 0/1 indicator.
actual.correct <- as.numeric(as.character(validation_data$correct))
gain <- gains(actual.correct, predict.model.validation)
heights <- gain$mean.resp / mean(actual.correct)
decile.chart <- barplot(heights, names.arg = gain$depth,
                        xlab = "percentile", ylab = "mean response")

# ROC curve and AUC for the logistic model on the validation split.
# NOTE(review): renamed the ROC object from `r` -- that single-letter name
# clobbered the response-factor variable created during preprocessing.
roc.curve <- roc(validation_data$correct, predict.model.validation)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(roc.curve)

auc(roc.curve)
## Area under the curve: 0.7101

# Gamma regression models for the "response time" variable ----

# Gamma regression: response time as a function of correctness and
# instruction, identity link (effects are additive on the rt scale).
gamma.model <- glm(rt ~ correct + instruction, data = train_data,
                   family = Gamma(link = "identity"))
predict.gamma.model.validation <- predict(gamma.model,
                                          validation_data, type = "response")
# Dichotomize predicted and observed rt at 0.5 to summarize agreement.
# NOTE(review): declare both factor levels on each side so confusionMatrix()
# cannot fail if one side of the cutoff happens to be empty.
confusionMatrix(
  data = factor(ifelse(predict.gamma.model.validation > 0.5, 1, 0),
                levels = c("0", "1")),
  reference = factor(ifelse(validation_data$rt > 0.5, 1, 0),
                     levels = c("0", "1")))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1941  684
##          1  764  611
##                                           
##                Accuracy : 0.638           
##                  95% CI : (0.6229, 0.6529)
##     No Information Rate : 0.6762          
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.1864          
##                                           
##  Mcnemar's Test P-Value : 0.03789         
##                                           
##             Sensitivity : 0.7176          
##             Specificity : 0.4718          
##          Pos Pred Value : 0.7394          
##          Neg Pred Value : 0.4444          
##              Prevalence : 0.6763          
##          Detection Rate : 0.4853          
##    Detection Prevalence : 0.6562          
##       Balanced Accuracy : 0.5947          
##                                           
##        'Positive' Class : 0               
## 
# Overfitting check for the gamma model: same dichotomized comparison on the
# training split; accuracy close to validation indicates no serious overfit.
predict.gamma.model.train <- predict(gamma.model, train_data, type = "response")
# NOTE(review): declare both factor levels explicitly (see validation block).
confusionMatrix(
  data = factor(ifelse(predict.gamma.model.train > 0.5, 1, 0),
                levels = c("0", "1")),
  reference = factor(ifelse(train_data$rt > 0.5, 1, 0),
                     levels = c("0", "1")))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4766 1576
##          1 1749 1409
##                                           
##                Accuracy : 0.65            
##                  95% CI : (0.6403, 0.6596)
##     No Information Rate : 0.6858          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.2004          
##                                           
##  Mcnemar's Test P-Value : 0.002856        
##                                           
##             Sensitivity : 0.7315          
##             Specificity : 0.4720          
##          Pos Pred Value : 0.7515          
##          Neg Pred Value : 0.4462          
##              Prevalence : 0.6858          
##          Detection Rate : 0.5017          
##    Detection Prevalence : 0.6676          
##       Balanced Accuracy : 0.6018          
##                                           
##        'Positive' Class : 0               
## 
# Overlay the fitted gamma density on the observed response-time histogram.
# NOTE(review): the dispersion was hard-coded as 0.08938668 (copied from a
# summary printout); compute it from the fitted model so the plot stays
# correct if the model is refit.
m0_dispersion <- summary(gamma.model)$dispersion
# shape: reciprocal of the dispersion parameter
m0_shape <- 1 / m0_dispersion
# scale: mean / shape. sum(coef(...)) under the identity link is the fitted
# mean for the cell where every dummy equals 1 -- presumably intended as a
# typical mean; TODO confirm this is the cell the author wanted.
m0_scale <- sum(coef(gamma.model)) / m0_shape
hist(rt, breaks = 40, freq = FALSE)
curve(dgamma(x, shape = m0_shape, scale = m0_scale),
      from = 0.2, to = 1.4, col = "red", add = TRUE)

# Sanity check: simulate 1000 datasets from the fitted gamma model and overlay
# the densities of the first 50 simulations on the observed rt density.
sims1 <- simulate(gamma.model, nsim = 1000)
plot(density(rt),
     main = "Simulated data for gamma model")
for (sim.index in seq_len(50)) {
  lines(density(sims1[[sim.index]]), col = "green")
}